# Start from a clean slate: remove every object that ls() reports in the
# current environment (names beginning with '.' are not listed, so they stay).
# Loaded packages are not affected.
rm(list=ls())

library(survival)
library(reshape2)
library(countrycode)
library(tidyverse)
library(lfe)
library(estprod)

# Restore the saved workspace image; the code below relies on it providing
# `data_binary`.
load('binary_distance_globalization.RData')

# Read the predicted set, drop every column whose name starts with "x", and
# keep only the rows with Predicted == 1.
undergrad_predicted <- read.csv('kw_relevance_predicted_set.csv', stringsAsFactors = FALSE) %>%
  select(-starts_with('x')) %>%
  filter(Predicted == 1)

# TRUE when the text contains "our"/"Our", then at most two intervening words,
# then a word beginning with "border" or "boundar".
undergrad_predicted$self_mention <- grepl(
  '[Oo]ur ([a-zA-Z0-9_]+ ){0,2}(border|boundar)',
  undergrad_predicted$text,
  perl = TRUE
)

# One row per (country, year) with at least one self-mention, expressed as a
# dyad in which the country is both country_1 and country_2.
self_mentions <- undergrad_predicted %>%
  filter(self_mention) %>%
  select(country_1 = country, year) %>%
  distinct() %>%
  mutate(country_2 = country_1, mention = 1, conttype = 1)

# Add, as all-NA columns, every data_binary column the self-dyads lack.
self_mentions[, setdiff(names(data_binary), names(self_mentions))] <- NA

data_binary <- bind_rows(data_binary, self_mentions)

# Map a vector of country codes to UN sub-region names. The countrycode()
# lookup runs first; the codes listed in `manual_subregion` are then given a
# fixed sub-region, replacing whatever the lookup returned for them.
subregion_for_codes <- function(codes) {
  manual_subregion <- c(
    'CSK'   = 'Eastern Europe',
    'DDR'   = 'Eastern Europe',
    'GE-AB' = 'Western Asia',
    'TWN'   = 'Eastern Asia',
    'XKX'   = 'Southern Europe',
    'YDM'   = 'Western Asia',
    'YUG'   = 'Southern Europe'
  )
  subregion <- countrycode(codes, 'iso3c', 'un.regionsub.name')
  hit <- match(codes, names(manual_subregion))
  has_override <- !is.na(hit)
  subregion[has_override] <- unname(manual_subregion[hit[has_override]])
  subregion
}

# Same treatment for both sides of the dyad.
data_binary$region_1 <- subregion_for_codes(data_binary$country_1)
data_binary$region_2 <- subregion_for_codes(data_binary$country_2)

# UN sub-region for each statement's country, followed by manual values for a
# fixed list of codes. The overrides are keyed on the country code column:
# testing the freshly computed sub-region *name* against a country code (e.g.
# region == 'CSK') can never be TRUE, so such overrides would silently do
# nothing. %in% is used so that missing country codes simply do not match.
undergrad_predicted$region <- countrycode(undergrad_predicted$country, 'iso3c', 'un.regionsub.name')
undergrad_predicted$region[undergrad_predicted$country %in% c('CSK', 'DDR')] <- 'Eastern Europe'
undergrad_predicted$region[undergrad_predicted$country %in% c('GE-AB', 'YDM')] <- 'Western Asia'
undergrad_predicted$region[undergrad_predicted$country %in% 'TWN'] <- 'Eastern Asia'
undergrad_predicted$region[undergrad_predicted$country %in% c('XKX', 'YUG')] <- 'Southern Europe'

# Lookup from UN sub-region name (as produced by countrycode's
# 'un.regionsub.name') to the coarser grouping used in this script. Names are
# the sub-regions, values the group. Western Asia and Northern Africa are
# pooled as 'MENA'. Keys must match the sub-region spelling exactly: recode()
# leaves any value without a matching key unchanged, so a misspelt key lets
# the raw sub-region name leak through as its own "continent".
continent_recode <- c(`Eastern Europe` = 'Europe',
                      `Western Europe` = 'Europe',
                      `Southern Europe` = 'Europe',
                      `Northern Europe` = 'Europe',
                      `Eastern Asia` = 'Asia',
                      `South-eastern Asia` = 'Asia',
                      `Southern Asia` = 'Asia',
                      `Central Asia` = 'Asia',
                      `Western Asia` = 'MENA',
                      `Northern Africa` = 'MENA',
                      `Sub-Saharan Africa` = 'Africa',
                      `Latin America and the Caribbean` = 'Americas',
                      `Northern America` = 'Americas',
                      `Polynesia` = 'Oceania',
                      `Micronesia` = 'Oceania',
                      `Melanesia` = 'Oceania',
                      `Australia and New Zealand` = 'Oceania')

# Collapse each side's sub-region into its continent group; sub-regions
# without an entry in continent_recode pass through unchanged.
data_binary[['continent_1']] <- recode(data_binary[['region_1']], !!!continent_recode)
data_binary[['continent_2']] <- recode(data_binary[['region_2']], !!!continent_recode)

# Unmatched place mentions: a target with no sub-region gets its country code
# set to NA.
data_binary[['country_2']] <- replace(
  data_binary[['country_2']],
  is.na(data_binary[['region_2']]),
  NA
)

# Sub-region and continent group for each statement, keyed on its country
# code. Four codes get a hand-assigned sub-region after the countrycode()
# lookup; every other row keeps the looked-up value (possibly NA).
undergrad_predicted <- undergrad_predicted %>%
  mutate(
    region = countrycode(country, 'iso3c', 'un.regionsub.name'),
    region = case_when(
      country %in% c('CSK', 'DDR') ~ 'Eastern Europe',
      country == 'YDYE' ~ 'Western Asia',
      country == 'YUG' ~ 'Southern Europe',
      TRUE ~ region
    ),
    continent = recode(region, !!!continent_recode)
  )

# Dyads whose two country codes coincide get conttype 1. which() keeps only the
# rows where the comparison is TRUE, so rows with a missing code on either
# side are left as they are.
data_binary$conttype[which(data_binary$country_1 == data_binary$country_2)] <- 1

# Figure C1
# Smoothed yearly mention counts by continent group, shown separately for the
# country_1 side ("Speaker") and the country_2 side ("Target").

# country_1 side: total mentions per sub-region, then the sub-regions that
# pass the (currently non-binding) >= 0 threshold.
region_1 <- data_binary %>%
  group_by(region_1) %>%
  dplyr::summarise(Mentions = sum(mention))
keep_1 <- with(region_1, region_1[Mentions >= 0])

# Yearly count and share of mentions per sub-region, restricted to retained
# sub-regions that have a continent.
region_year_1 <- data_binary %>%
  group_by(region_1, year, continent_1) %>%
  dplyr::summarise(Mentions = sum(mention), Mention_Prop = mean(mention)) %>%
  filter(region_1 %in% keep_1 & !is.na(continent_1))

# country_2 side: identical steps.
region_2 <- data_binary %>%
  group_by(region_2) %>%
  dplyr::summarise(Mentions = sum(mention))
keep_2 <- with(region_2, region_2[Mentions >= 0])

region_year_2 <- data_binary %>%
  group_by(region_2, year, continent_2) %>%
  dplyr::summarise(Mentions = sum(mention), Mention_Prop = mean(mention)) %>%
  filter(region_2 %in% keep_2 & !is.na(continent_2))

# Stack both sides into one long table with a common `Continent` column and a
# `Type` label telling the two sides apart.
combined <- bind_rows(
  region_year_1 %>%
    ungroup() %>%
    select(-region_1) %>%
    dplyr::rename(Continent = continent_1) %>%
    mutate(Type = 'Speaker'),
  region_year_2 %>%
    ungroup() %>%
    select(-region_2) %>%
    dplyr::rename(Continent = continent_2) %>%
    mutate(Type = 'Target')
)

# One panel per continent group, one smoothed line (with confidence band) per
# Type; the legend is placed at relative position (0.7, 0.15).
ggplot(combined, aes(x = year, y = Mentions, color = Type)) +
  geom_smooth(se = TRUE) +
  facet_wrap(~Continent) +
  theme_minimal() +
  theme(legend.position = c(0.7, 0.15))
